Inspired by: https://github.com/jakevdp/JupyterWorkflow
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import pandas as pd
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the
# old name was removed in 3.8, so pick whichever name this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D
In [2]:
import jvmthreadparser.parser as jtp
In [3]:
# Parse the thread-dump text file with jvmthreadparser.
# NOTE(review): load_thread_content=False presumably skips the per-thread
# content to keep the result small - confirm against the parser's docs.
dump = jtp.open_text(
    'threads4.txt',
    load_thread_content=False,
)
In [4]:
# Preview the first rows of the parsed dump as a quick sanity check of the load.
dump.head()
Out[4]:
In [5]:
# Tag every row with a unit value so that counting per group tallies threads.
dump['Threads'] = 1

# Count rows for each (DateTime, State) pair.
state_counts = dump.groupby(['DateTime', 'State']).count()

# Move State into the columns; pairs that never occur are filled with 0.
threads_by_state = state_counts.unstack().fillna(0)

# Drop the outermost column level so that columns are labelled by State only.
threads_by_state.columns = threads_by_state.columns.droplevel()

threads_by_state.head()
Out[5]:
In [6]:
# One line per thread state across the whole capture period.
ax = threads_by_state.plot(
    title='Thread State by Date',
    cmap='Paired',
    figsize=(14, 12),
)
ax.set(xlabel='Day of Month', ylabel='Number of Threads');
In [7]:
# Collapse all days together: mean thread count per state for each hour of the day.
hourly_mean = threads_by_state.groupby(threads_by_state.index.hour).mean()

ax = hourly_mean.plot(
    title='Threads by Hour',
    cmap='Paired',
    figsize=(14, 12),
)
ax.set(xlabel='Hour of the Day (0-23)', ylabel='Mean(Number of Threads)');
In [8]:
# Mean thread count per state for each calendar day.
daily_mean = threads_by_state.resample('D').mean()

# A title is supplied so this figure stands alone like the other line plots
# in this notebook.
ax = daily_mean.plot(
    figsize=(14, 12),
    cmap='Paired',
    title='Mean Threads per Day by State',
)
ax.set_xlabel('Day of Month')
ax.set_ylabel('Mean(Number of Threads)');
In [9]:
# Mean thread count per state within every hour. The lowercase 'h' alias is used
# because pandas 2.2 deprecated the uppercase 'H' spelling.
by_hour = threads_by_state.resample('h').mean()

# Reshape so that rows are times of day and columns are calendar dates, i.e. each
# column is one day's hourly profile of TIMED_WAITING (PARKING) threads.
# Hours with no data for a given date are filled with 0.
pivoted = by_hour.pivot_table(
    "TIMED_WAITING (PARKING)",
    index=by_hour.index.time,
    columns=by_hour.index.date,
).fillna(0)

# Overlay every day as a faint black line so recurring daily shapes stand out.
ax = pivoted.plot(
    legend=False,
    alpha=0.3,
    color='black',
    title='Day Patterns of TIMED_WAITING (PARKING) Threads by Time',
    figsize=(14, 12),
)
ax.set_xlabel('Time')
ax.set_ylabel('Number of Threads');
In [10]:
# Transpose so that each row is one day (one sample) and each column is a time
# of day (one feature); any remaining gaps are treated as 0.
day_profiles = pivoted.fillna(0).T
X = day_profiles.values
X.shape
Out[10]:
In [11]:
# Project every day's profile onto its first three principal components.
pca = PCA(n_components=3, svd_solver='full')
X2 = pca.fit_transform(X)
X2.shape
Out[11]:
In [12]:
# 3-D scatter of the days in principal-component space (one point per day).
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111, projection='3d')

pc1, pc2, pc3 = X2[:, 0], X2[:, 1], X2[:, 2]
ax.scatter(pc1, pc2, pc3)

ax.set(
    title='PCA Dimensionality Reduction (3 Dimensions)',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
    zlabel='Principal Component 3',
);
In [13]:
# Fit a 3-component Gaussian mixture on the per-day profiles and assign each day
# to its most likely component.
# random_state is pinned because the mixture's initialisation is random: without
# a seed the component numbering (and possibly the grouping itself) can change
# from run to run, which would silently reshuffle the "Cluster 0/1/2" figures.
gmm = GaussianMixture(n_components=3, random_state=0).fit(X)
labels = gmm.predict(X)
In [14]:
# Same PCA scatter as before, now coloured by the mixture-model cluster label.
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

# One discrete colour per cluster label (0, 1, 2).
cMap = ListedColormap(['green', 'blue', 'red'])

# Columns 0..2 of X2 are unpacked as the x, y and z coordinates.
p = ax.scatter(*X2[:, :3].T, c=labels, cmap=cMap)

ax.set(
    title='Unsupervised Clustering (3 Clusters with Colors)',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
    zlabel='Principal Component 3',
);

# Put the colour-bar ticks exactly on the three label values.
colorbar = fig.colorbar(p, ticks=np.linspace(0, 2, 3))
colorbar.set_label('Cluster')
In [15]:
# One panel per cluster, each overlaying the daily profiles assigned to it.
fig, ax = plt.subplots(1, 3, figsize=(14, 6))

for cluster_id, panel in enumerate(ax):
    # Columns of `pivoted` are days; keep only the days labelled with this cluster.
    members = pivoted.loc[:, labels == cluster_id]
    # DataFrame.plot raises on a frame with no columns, so a cluster that received
    # no days is left as an empty (but still titled) panel instead of failing.
    if not members.empty:
        members.plot(legend=False, alpha=0.4, ax=panel)
    panel.set_title('Cluster %d' % cluster_id)
    panel.set_xlabel('Time')

# Only the left-most panel carries the y-axis label; the trailing semicolon
# suppresses the Text(...) repr, matching the other plotting cells.
ax[0].set_ylabel('Number of Threads');
Out[15]:
In [16]:
# The columns of `pivoted` are calendar dates; convert them to a DatetimeIndex
# to read off each day's weekday number.
pivot_dates = pd.DatetimeIndex(pivoted.columns)
dayofweek = pivot_dates.dayofweek
In [17]:
# PCA scatter again, this time coloured by weekday rather than by cluster label,
# to see whether the groups line up with the day of the week.
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

xs, ys, zs = X2[:, 0], X2[:, 1], X2[:, 2]
p = ax.scatter(xs, ys, zs, c=dayofweek, cmap='rainbow')

ax.set(
    title='Unsupervised Clustering (3 Clusters) Colored by Weekday',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
    zlabel='Principal Component 3',
);

colorbar = fig.colorbar(p)
colorbar.set_label('Weekday (0=Monday, Sunday=6)')